In this notebook we will analyse the Crime Incident Reports of the city of Boston, US, for the years 2016-2018. The actual dataset contains records for events from June 2015 to January 2019. The dataset is provided by the Boston Police Department (BPD) "to document the initial details surrounding an incident to which BPD officers respond" (Source).
Since there are more than 355k events registered in this dataset, we will focus only on the incidents related to motor vehicles.
Police responses are logged and reported chronologically by the Boston Police Department.
We will be focusing on the "OFFENSE_CODE_GROUP" column/feature, which is the "internal categorization" of the incident.
The "OCCURRED_ON_DATE" column will be one of our most valuable assets: we will look at days, months and years. The "INCIDENT_NUMBER" column will help us count events. "SHOOTING" indicates whether a shooting took place.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
%matplotlib inline
import os
# suppress warnings from final output
import warnings
warnings.simplefilter("ignore")
# Load the raw incident reports from 'crime.csv' in the working directory.
# engine='python' selects pandas' Python parser instead of the default C
# parser -- NOTE(review): presumably chosen to work around a parsing/encoding
# problem with this file; confirm before changing.
df = pd.read_csv('crime.csv', engine='python')
# Quick inspection: (rows, columns), the first rows, and per-column dtypes
# with non-null counts.
df.shape
df.head()
df.info()
# Ten most frequent offense categories across the whole dataset.
df['OFFENSE_CODE_GROUP'].value_counts().head(10)
# Horizontal bar chart of the ten most common offense categories,
# ordered from most to least frequent.
top_ten_groups = df['OFFENSE_CODE_GROUP'].value_counts().head(10).index
sns.catplot(data=df,
            y='OFFENSE_CODE_GROUP',
            order=top_ten_groups,
            kind='count',
            aspect=1.5,
            height=8);
# Keep only the incidents whose category mentions a motor-vehicle related
# keyword. The pattern is a regex alternation matched anywhere in the string.
vehicle_pattern = r'Motor|Vehicle|Accident|Tow|License Plate|Auto|M/V|Traffic|Speeding'
# na=False: rows with a missing category count as "no match"; without it the
# mask would contain NaN and boolean indexing would raise.
is_vehicle_related = df['OFFENSE_CODE_GROUP'].str.contains(vehicle_pattern, regex=True, na=False)
# .copy() makes `auto` an independent frame, so later column assignments
# modify it directly instead of a slice of `df`.
auto = df[is_vehicle_related].copy()
auto.head(3)
# Work on an independent copy so the column assignments below are guaranteed
# to land in `auto` (it was created by filtering `df`).
auto = auto.copy()
# Convert the event date to datetime format so the .dt accessors can be used.
auto['OCCURRED_ON_DATE'] = pd.to_datetime(auto['OCCURRED_ON_DATE'])
# Fill empty values with "N" for no shooting.
# "Y" means the incident was related to a shooting.
# Assign the result back instead of calling fillna(inplace=True) on an
# attribute-accessed column, which may operate on a temporary object.
auto['SHOOTING'] = auto['SHOOTING'].fillna('N')
# Convert 'DAY_OF_WEEK' to an ordered category so plots and sorts follow the
# calendar order; values not in the list become NaN.
auto['DAY_OF_WEEK'] = pd.Categorical(
    auto['DAY_OF_WEEK'],
    categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    ordered=True,
)
# Replace the -1 placeholder coordinates with NaN.
# np.nan is used rather than None: depending on the pandas version,
# replace(-1, None) either forward-fills the previous value or converts the
# column to object dtype, neither of which marks the value as missing float.
auto['Lat'] = auto['Lat'].replace(-1, np.nan)
auto['Long'] = auto['Long'].replace(-1, np.nan)
# Drop the columns that are not used in the rest of the analysis.
auto = auto.drop(columns=['REPORTING_AREA', 'UCR_PART', 'DISTRICT'])
*Please keep in mind that the visualizations below display event counts for all three years (2016-2018) combined, unless otherwise specified.
# Restrict the analysis to years strictly between 2015 and 2019.
after_2015 = auto['YEAR'] > 2015
before_2019 = auto['YEAR'] < 2019
auto = auto[after_2015 & before_2019]
# Bar chart of incident counts per category, most frequent first.
sns.set(rc={'figure.figsize': (15, 6)})
categories_by_frequency = auto['OFFENSE_CODE_GROUP'].value_counts().index
sns.countplot(data=auto, x='OFFENSE_CODE_GROUP', order=categories_by_frequency)
plt.title("Incident Categories", size=35)
plt.xlabel("")
plt.ylabel('No of Incidents')
# Rotate the category names so they do not overlap.
plt.xticks(rotation=45)
plt.show()
# Count of incidents for each value of the 'HOUR' column.
sns.catplot(data=auto,
            x='HOUR',
            kind='count',
            color='lightblue',
            height=8.27,
            aspect=3)
plt.title("Incidents per Hour in the Day", size=50)
plt.xlabel('Hour', fontsize=30)
plt.ylabel('Count', fontsize=30)
plt.xticks(size=20)
plt.yticks(size=20);
# One line per offense category: number of incidents for each hour of the day.
hour_of_day = auto['OCCURRED_ON_DATE'].dt.hour
hourly_counts = auto.groupby([hour_of_day, 'OFFENSE_CODE_GROUP'])['INCIDENT_NUMBER'].count()
# unstack() pivots the categories into columns so each gets its own line.
hourly_by_category = hourly_counts.unstack()
hourly_by_category.plot(marker='o', figsize=(15, 10))
plt.title("Incidents per Hour in the Day", size=40)
plt.xlabel('Hour of the day')
plt.ylabel('No of Incidents')
plt.legend(fontsize="x-large")
# One tick per hour, 0 through 23.
plt.xticks(np.arange(24));
# Count of incidents for each value of the 'DAY_OF_WEEK' column.
sns.catplot(data=auto,
            x='DAY_OF_WEEK',
            kind='count',
            aspect=3,
            height=8)
plt.title("Incidents per Day of the Week", size=50)
plt.xlabel('')
plt.ylabel('Count', fontsize=40)
plt.xticks(size=30)
plt.yticks(size=30);
# Count of incidents per calendar month, labelled with month abbreviations.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
sns.catplot(data=auto,
            x='MONTH',
            kind='count',
            color='lightblue',
            aspect=3,
            height=8)
plt.title("Incidents per Months in the Year", size=50)
plt.xlabel('')
plt.ylabel('Count', fontsize=40)
# Bar positions are 0..11, so the twelve labels map onto them in order.
plt.xticks(np.arange(12), months, size=30)
plt.yticks(size=30);
# Line chart of the total number of incidents per calendar month.
# The leading empty label covers tick 0, so the labels at ticks 1..12
# line up with the values of the 'MONTH' column.
months = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
incidents_per_month = auto.groupby('MONTH')['INCIDENT_NUMBER'].count()
incidents_per_month.plot(figsize=(15, 5),
                         color='red',
                         linewidth=2,
                         marker='o',
                         markersize=12,
                         markerfacecolor='lightblue')
plt.xticks(np.arange(13), months)
plt.ylabel('No of Incidents')
plt.title("Incidents per Months in the Year", size=40);
# One line per offense category: number of incidents for each calendar month.
# The leading empty label covers tick 0 so 'Jan' lands on month 1.
months = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_of_year = auto['OCCURRED_ON_DATE'].dt.month
monthly_counts = auto.groupby([month_of_year, 'OFFENSE_CODE_GROUP'])['INCIDENT_NUMBER'].count()
# unstack() pivots the categories into columns so each gets its own line.
monthly_counts.unstack().plot(marker='o', figsize=(15, 10))
plt.xlabel("")
plt.ylabel('No of Incidents')
plt.legend(fontsize="x-large")
plt.xticks(np.arange(13), months)
plt.title("Type of Incidents per Months in the Year", size=37);
The only months where the number of incidents was higher in 2018 than in 2017 were June and November.
# Grouped bar chart: for each calendar month, one bar per year showing the
# number of incidents, so the years can be compared month by month.
monthly_counts_by_year = auto.groupby(['MONTH', 'YEAR'])['INCIDENT_NUMBER'].count().unstack()
monthly_counts_by_year.plot(kind='bar', figsize=(15, 6))
plt.ylabel('No of Incidents')
plt.xlabel("Month of the year")
# Place the legend outside the axes so it does not cover the bars.
plt.legend(loc='center left', bbox_to_anchor=(1, .8), fontsize="x-large")
# The bars are split by year, not by incident type, so the title says so.
plt.title("Incidents per Month by Year", size=37);
# Motor-vehicle related incidents that involved a shooting.
shooting_incidents = auto[auto['SHOOTING'] == "Y"]
shooting_incidents
sns.set(rc={'figure.figsize': (10, 6)})
# The bar order follows the overall category frequency in `auto`, so the
# categories keep the same order as in the full category chart.
overall_category_order = auto['OFFENSE_CODE_GROUP'].value_counts().index
sns.countplot(data=shooting_incidents, x='OFFENSE_CODE_GROUP', order=overall_category_order)
plt.title('No of Incidents where there were shootings')
plt.xticks(rotation=45)
plt.show()
# Renumber the rows 0..n-1 after the earlier filtering steps.
auto = auto.reset_index(drop=True)
# Create the base Folium crime map.
crime_map = folium.Map(location=[42.3125, -71.0875],
                       zoom_start=11)
# Build the heatmap input: [Lat, Long] pairs for rows that have both
# coordinates, capped at the first 50000 points.
# Selecting the two columns before dropna/tolist replaces the original dead
# column selection and the row-by-row iterrows() loop.
auto_heatmap = (
    auto[['Lat', 'Long']]
    .dropna(axis=0, subset=['Lat', 'Long'])
    .head(50000)
    .values
    .tolist()
)
HeatMap(auto_heatmap, radius=10).add_to(crime_map)
# Plot!
crime_map
# Place one marker per incident for the first 2000 rows of `auto`.
# NOTE(review): the original comment calls these the "last 2000 incidents";
# that holds only if the file lists the most recent reports first -- confirm.
# Looks like these are all over the city.
max_rows = 2000
# Named incident_map rather than `map` so the builtin map() is not shadowed.
incident_map = folium.Map(width=800,
                          height=500,
                          location=[42.33, -71.070],
                          zoom_start=12)
# Rows without coordinates cannot be drawn; drop them explicitly instead of
# hiding every possible error behind a bare `except: pass`.
plottable = auto.head(max_rows).dropna(subset=['Lat', 'Long'])
for lat, lon, street in zip(plottable['Lat'], plottable['Long'], plottable['STREET']):
    folium.Marker([lat, lon], popup=street).add_to(incident_map)
incident_map
# Frequency of each value in the 'Location' column, most common first.
# NOTE(review): the original remark that most events lack coordinates should
# be checked against the counts displayed below.
location_counts = auto.Location.value_counts()
location_counts.head(7)
# Take the top 5 coordinates in our dataset.
# We skip the first 2 because they are placeholder values.
# Only the index (the location strings) is needed, so the former `< 142`
# comparison, whose result was never used, has been removed.
temp = location_counts.index[2:7].tolist()
temp
# The further you stay away from these coordinates, the safer!
# Hard-coded (latitude, longitude) pairs, each drawn as a marker.
hotspot_coordinates = [
    (42.32696647, -71.06198607),
    (42.33152148, -71.07085307),
    (42.36067984, -71.05482325),
    (42.32809966, -71.06321676),
    (42.36183857, -71.05976489),
]
map = folium.Map(width=800, height=500, location=[42.33, -71.070], zoom_start=12)
for latitude, longitude in hotspot_coordinates:
    folium.Marker([latitude, longitude]).add_to(map)
map